#import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the anime dataset from its CSV file
file_path = "C:/PERSONAL/Personal/PROJ/NTI/Python/Final Project/Final Project/anime.csv"
anime_df = pd.read_csv(filepath_or_buffer=file_path)
# Preview the first five rows of the DataFrame
anime_df.head(n=5)
| Score | Popularity | Rank | Members | Description | Synonyms | Japanese | English | Type | Episodes | ... | Premiered | Broadcast | Producers | Licensors | Studios | Source | Genres | Demographic | Duration | Rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9.38 | 284.0 | 1.0 | 710.0 | During their decade-long quest to defeat the D... | Frieren at the Funeral | 葬送のフリーレン | Frieren: Beyond Journey's End | TV | 28 | ... | Fall 2023 | Fridays at 23:00 (JST) | Aniplex, Dentsu, Shogakukan-Shueisha Productio... | None found, add some | Madhouse | Manga | AdventureAdventure, DramaDrama, FantasyFantasy | ShounenShounen | 24 min. per ep. | PG-13 - Teens 13 or older |
| 1 | 9.09 | 3.0 | 2.0 | 3.0 | After a horrific alchemy experiment goes wrong... | Hagane no Renkinjutsushi: Fullmetal Alchemist,... | 鋼の錬金術師 FULLMETAL ALCHEMIST | Fullmetal Alchemist: Brotherhood | TV | 64 | ... | Spring 2009 | Sundays at 17:00 (JST) | Aniplex, Square Enix, Mainichi Broadcasting Sy... | Funimation, Aniplex of America | Bones | Manga | ActionAction, AdventureAdventure, DramaDrama, ... | ShounenShounen | 24 min. per ep. | R - 17+ (violence & profanity) |
| 2 | 9.07 | 13.0 | 3.0 | 2.0 | Eccentric scientist Rintarou Okabe has a never... | NaN | STEINS;GATE | Steins;Gate | TV | 24 | ... | Spring 2011 | Wednesdays at 02:05 (JST) | Frontier Works, Media Factory, Kadokawa Shoten... | Funimation | White Fox | Visual novel | DramaDrama, Sci-FiSci-Fi, SuspenseSuspense | NaN | 24 min. per ep. | PG-13 - Teens 13 or older |
| 3 | 9.06 | 342.0 | 4.0 | 630.0 | Gintoki, Shinpachi, and Kagura return as the f... | Gintama' (2015) | 銀魂° | Gintama Season 4 | TV | 51 | ... | Spring 2015 | Wednesdays at 18:00 (JST) | TV Tokyo, Aniplex, Dentsu | Funimation, Crunchyroll | Bandai Namco Pictures | Manga | ActionAction, ComedyComedy, Sci-FiSci-Fi | ShounenShounen | 24 min. per ep. | PG-13 - Teens 13 or older |
| 4 | 9.05 | 21.0 | 5.0 | 2.0 | Seeking to restore humanity's diminishing hope... | NaN | 進撃の巨人 Season3 Part.2 | Attack on Titan Season 3 Part 2 | TV | 10 | ... | Spring 2019 | Mondays at 00:10 (JST) | Production I.G, Dentsu, Mainichi Broadcasting ... | Funimation | Wit Studio | Manga | ActionAction, DramaDrama, SuspenseSuspense | ShounenShounen | 23 min. per ep. | R - 17+ (violence & profanity) |
5 rows × 22 columns
# Preview the last five rows of the DataFrame
anime_df.tail(n=5)
| Score | Popularity | Rank | Members | Description | Synonyms | Japanese | English | Type | Episodes | ... | Premiered | Broadcast | Producers | Licensors | Studios | Source | Genres | Demographic | Duration | Rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 996 | 7.8 | 1658.0 | 997.0 | 136.0 | Not so long ago, mysterious structures called ... | Magi: Adventure of Sinbad OVA | マギ シンドバッドの冒険 | NaN | OVA | 5 | ... | NaN | NaN | Aniplex, Shogakukan | None found, add some | Lay-duce | Manga | ActionAction, AdventureAdventure, FantasyFantasy | ShounenShounen | 24 min. per ep. | PG-13 - Teens 13 or older |
| 997 | 7.8 | 3114.0 | 998.0 | 50.0 | Fifteen years ago in a barren stretch of the P... | NaN | 名探偵コナン 水平線上の陰謀 | Detective Conan Movie 09: Strategy Above the D... | Movie | 1 | ... | NaN | NaN | None found, add some | None found, add some | TMS Entertainment | Manga | AdventureAdventure, ComedyComedy, MysteryMystery | ShounenShounen | 1 hr. 50 min. | PG-13 - Teens 13 or older |
| 998 | 7.8 | 3194.0 | 999.0 | 47.0 | Serial murders involving all kinds have happen... | Meitantei Conan: Senritsu no Gakufu [Full Score] | 名探偵コナン 戦慄の楽譜(フルスコア) | Detective Conan Movie 12: Full Score of Fear | Movie | 1 | ... | NaN | NaN | Shogakukan-Shueisha Productions, Tokyo Movie S... | None found, add some | TMS Entertainment | Manga | AdventureAdventure, ComedyComedy, MysteryMystery | ShounenShounen | 1 hr. 56 min. | PG-13 - Teens 13 or older |
| 999 | 7.8 | 3598.0 | 1000.0 | 38.0 | One foggy morning, a black and white Toyota AE... | Shin Gekijouban Initial D: Legend 1 - Kakusei | 新劇場版 頭文字[イニシャル]D Legend1 -覚醒- | Initial D Legend 1 Awakening | Movie | 1 | ... | NaN | NaN | Shochiku, Kodansha, Ultra Super Pictures, Avex... | Sentai Filmworks | SANZIGEN, LIDENFILMS | Manga | NaN | SeinenSeinen | 1 hr. 2 min. | PG-13 - Teens 13 or older |
| 1000 | Total | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1000 |
5 rows × 22 columns
# Dimensions of the DataFrame as (number of rows, number of columns)
(len(anime_df.index), len(anime_df.columns))
(1001, 22)
# Per-column summary: non-null counts, dtypes and memory usage
anime_df.info(verbose=None)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1001 entries, 0 to 1000 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Score 1001 non-null object 1 Popularity 1000 non-null float64 2 Rank 1000 non-null float64 3 Members 1000 non-null float64 4 Description 1000 non-null object 5 Synonyms 709 non-null object 6 Japanese 999 non-null object 7 English 859 non-null object 8 Type 1000 non-null object 9 Episodes 1000 non-null object 10 Status 1000 non-null object 11 Aired 1000 non-null object 12 Premiered 569 non-null object 13 Broadcast 569 non-null object 14 Producers 1000 non-null object 15 Licensors 1000 non-null object 16 Studios 1000 non-null object 17 Source 1000 non-null object 18 Genres 771 non-null object 19 Demographic 521 non-null object 20 Duration 1000 non-null object 21 Rating 1001 non-null object dtypes: float64(3), object(19) memory usage: 172.2+ KB
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
anime_df.describe(percentiles=[0.25, 0.5, 0.75])
| Popularity | Rank | Members | |
|---|---|---|---|
| count | 1000.000000 | 1000.000000 | 1000.000000 |
| mean | 1805.637000 | 500.500000 | 223.648000 |
| std | 1888.308553 | 288.819436 | 246.288299 |
| min | 1.000000 | 1.000000 | 1.000000 |
| 25% | 413.750000 | 250.750000 | 31.750000 |
| 50% | 1139.500000 | 500.500000 | 132.000000 |
| 75% | 2633.750000 | 750.250000 | 330.000000 |
| max | 12043.000000 | 1000.000000 | 998.000000 |
# List the column labels of the DataFrame
anime_df.keys()
Index(['Score', 'Popularity', 'Rank', 'Members', 'Description', 'Synonyms',
'Japanese', 'English', 'Type', 'Episodes', 'Status', 'Aired',
'Premiered', 'Broadcast', 'Producers', 'Licensors', 'Studios', 'Source',
'Genres', 'Demographic', 'Duration', 'Rating'],
dtype='object')
# Check null values and handle them
# Count the missing entries in every column before any cleaning is applied
null_counts = anime_df.isna().sum(axis=0)
print("Null values per column:\n", null_counts)
Null values per column: Score 0 Popularity 1 Rank 1 Members 1 Description 1 Synonyms 292 Japanese 2 English 142 Type 1 Episodes 1 Status 1 Aired 1 Premiered 432 Broadcast 432 Producers 1 Licensors 1 Studios 1 Source 1 Genres 230 Demographic 480 Duration 1 Rating 0 dtype: int64
# Handle missing values.
# The CSV ends with a summary footer row (Score == 'Total', Rating holding the
# row count, every other field empty). It is not an anime, so drop it instead
# of imputing values for it; otherwise it shows up as a fake 'Unknown' title
# in every count and chart below.
anime_df = anime_df[anime_df['Score'] != 'Total'].reset_index(drop=True)

# Convert the 'Episodes' column to numeric, forcing non-numeric values to NaN
anime_df['Episodes'] = pd.to_numeric(anime_df['Episodes'], errors='coerce')

# Numeric columns: fill any remaining gaps with the column median
for numeric_column in ['Episodes', 'Popularity', 'Rank', 'Members']:
    column_median = anime_df[numeric_column].median()
    anime_df[numeric_column] = anime_df[numeric_column].fillna(column_median)

# Text columns: fill any remaining gaps with an explicit placeholder.
# Columns not listed here are left untouched.
text_placeholders = {
    'Description': 'No Description Available',
    'Type': 'Unknown',
    'Status': 'Unknown',
    'Aired': 'Unknown',
    'Producers': 'Unknown',
    'Licensors': 'Unknown',
    'Studios': 'Unknown',
    'Source': 'Unknown',
    'Duration': 'Unknown',
    'English': 'Unknown',
    'Genres': 'Not Specified',
}
anime_df = anime_df.fillna(value=text_placeholders)
# Check for duplicate rows and handle them
duplicate_count = anime_df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")
# Remove duplicate rows from anime_df itself. anime_df_cleaned does not exist
# yet at this point (it is only created when the unnecessary columns are
# dropped further down), so referencing it here raises NameError on a fresh run.
anime_df = anime_df.drop_duplicates()
Number of duplicate rows: 0
# Re-count the missing entries per column after the fills above
null_counts = anime_df.isna().sum(axis=0)
print("Null values per column:\n", null_counts)
Null values per column: Score 0 Popularity 0 Rank 0 Members 0 Description 0 Synonyms 292 Japanese 2 English 0 Type 0 Episodes 0 Status 0 Aired 0 Premiered 432 Broadcast 432 Producers 0 Licensors 0 Studios 0 Source 0 Genres 0 Demographic 480 Duration 0 Rating 0 dtype: int64
# Drop the columns considered unnecessary; the result is a new DataFrame
columns_to_drop = ['Synonyms', 'Premiered', 'Broadcast', 'Demographic', 'Japanese']
anime_df_cleaned = anime_df.drop(labels=columns_to_drop, axis=1)
# 1) Top 10 Anime by Popularity
# Order the rows by ascending 'Popularity' and keep the first ten
top_10_anime_popularity = anime_df_cleaned.sort_values(by='Popularity', ascending=True).iloc[:10]
# Bar chart of the selected titles, labelled by their English name
plt.figure(figsize=(12, 8))
ax = sns.barplot(x='English', y='Popularity', data=top_10_anime_popularity, palette='viridis')
ax.set_title('Top 10 Anime by Popularity')
ax.set_xlabel('Anime (English)')
ax.set_ylabel('Popularity')
# Rotate the tick labels so the titles stay readable
plt.xticks(rotation=45, ha='right')
plt.show()
# 2) Score distribution
# Parse 'Score' as numbers; values that cannot be parsed become NaN
numeric_scores = pd.to_numeric(anime_df_cleaned['Score'], errors='coerce')
anime_df_cleaned['Score'] = numeric_scores
# Histogram with 20 bins and a KDE curve on top
plt.figure(figsize=(10, 6))
ax = sns.histplot(data=anime_df_cleaned['Score'], bins=20, kde=True)
ax.set_title('Score Distribution')
ax.set_xlabel('Score')
ax.set_ylabel('Frequency')
plt.show()
######## 3) Yearwise Members ############
# The 7 years with the largest summed 'Members'
# The first 4-digit number found in 'Aired' is stored as the new 'Year' column
first_year_text = anime_df_cleaned['Aired'].str.extract(r'(\d{4})', expand=False)
anime_df_cleaned['Year'] = first_year_text.astype(float)
# Sum 'Members' per year, then keep the 7 largest totals
members_by_year = anime_df_cleaned.groupby('Year')['Members'].sum()
yearly_members = members_by_year.nlargest(7)
# Bar chart of those totals
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=yearly_members.index, y=yearly_members.values, palette='Blues')
ax.set_title('Top 7 Years by Total Members')
ax.set_xlabel('Year')
ax.set_ylabel('Total Members')
plt.show()
# 4) Type
# Leading counts from the recorded run: TV 569, Movie 235, OVA 84
# Number of titles per anime type, most frequent first
type_counts = anime_df_cleaned['Type'].value_counts(sort=True, ascending=False)
TV 569 Movie 235 OVA 84 ONA 53 Special 37 TV Special 22 Unknown 1 Name: Type, dtype: int64
# Bar chart: number of titles per type
plt.figure(figsize=(8, 5))
ax = sns.barplot(x=type_counts.index, y=type_counts.values, palette='pastel')
ax.set_title('Distribution of Anime Types')
ax.set_xlabel('Type')
ax.set_ylabel('Count')
plt.show()
# Pie chart: share of each type, with percentage labels
plt.figure(figsize=(8, 8))
pastel_colors = sns.color_palette('pastel')
plt.pie(
    type_counts,
    labels=type_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=pastel_colors,
)
plt.title('Anime Types Distribution')
plt.show()
# 5) Genres
# Leading combinations from the recorded run:
#   ActionAction, AdventureAdventure, FantasyFantasy   43
#   ComedyComedy, RomanceRomance                       28
#   ActionAction, FantasyFantasy                       23
#   ActionAction, Sci-FiSci-Fi                         22
#   DramaDrama, RomanceRomance                         21
# Each distinct 'Genres' string is counted as one combination; keep the 10 most frequent
genre_frequency = anime_df_cleaned['Genres'].value_counts()
genre_combinations = genre_frequency.nlargest(10)
print(genre_combinations)
Not Specified 230 ActionAction, AdventureAdventure, FantasyFantasy 43 ComedyComedy, RomanceRomance 28 ActionAction, FantasyFantasy 23 ActionAction, Sci-FiSci-Fi 22 DramaDrama, RomanceRomance 21 ActionAction, DramaDrama, Sci-FiSci-Fi 21 ActionAction, ComedyComedy, Sci-FiSci-Fi 15 ActionAction, MysteryMystery, SupernaturalSupernatural 15 AdventureAdventure, ComedyComedy, MysteryMystery 14 Name: Genres, dtype: int64
# Bar chart of the most frequent genre combinations
plt.figure(figsize=(12, 6))
ax = sns.barplot(x=genre_combinations.index, y=genre_combinations.values, palette='magma')
ax.set_title('Top Genres in Anime')
ax.set_xlabel('Genre')
ax.set_ylabel('Count')
# Rotate the long combination labels so they do not overlap
plt.xticks(rotation=45, ha='right')
plt.show()
# 6) Top Studios
# Count the number of anime credited to each studio.
# 'Studios' lists co-producing studios separated by ", ", so after splitting on
# ',' every name but the first carries a leading space. Strip it, otherwise the
# same studio is counted under two different labels.
studio_names = anime_df_cleaned['Studios'].str.split(',').explode().str.strip()
studio_counts = studio_names.value_counts().nlargest(10)
studio_counts
Madhouse 59 Production I.G 58 Sunrise 52 TMS Entertainment 49 Bones 41 Toei Animation 40 A-1 Pictures 36 Kyoto Animation 36 J.C.Staff 33 Shaft 31 Name: Studios, dtype: int64
# Bar chart of the studios with the most titles
plt.figure(figsize=(12, 6))
ax = sns.barplot(x=studio_counts.index, y=studio_counts.values, palette='cividis')
ax.set_title('Top 10 Anime Studios')
ax.set_xlabel('Studio')
ax.set_ylabel('Count')
# Rotate the studio names so they do not overlap
plt.xticks(rotation=45, ha='right')
plt.show()
# 7) Source
# Leading counts from the recorded run: Manga 547, Original 160, Light novel 110, Novel 63
# Number of titles per source material, most frequent first
source_counts = anime_df_cleaned['Source'].value_counts(sort=True, ascending=False)
source_counts
Manga 547 Original 160 Light novel 110 Novel 63 Web manga 32 Visual novel 24 4-koma manga 20 Web novel 14 Game 14 Other 6 Mixed media 6 Unknown 2 Book 1 Music 1 Picture book 1 Name: Source, dtype: int64
# Pie chart: share of each source material, with percentage labels
plt.figure(figsize=(8, 8))
paired_colors = sns.color_palette('Paired')
plt.pie(
    source_counts,
    labels=source_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=paired_colors,
)
plt.title('Anime Sources Distribution')
plt.show()
import plotly.express as px
# Top 10 Anime by Popularity
# 'Popularity' is a rank (1 = most popular; the axis below is labelled
# "Popularity Rank"), so the top 10 are the 10 SMALLEST values. nlargest would
# select the ten least popular titles and contradict the matplotlib version of
# this chart, which sorts ascending.
top_10_anime = anime_df_cleaned.nsmallest(10, 'Popularity')[['English', 'Popularity']]
fig1 = px.bar(
    top_10_anime,
    x='English',
    y='Popularity',
    title="Top 10 Anime by Popularity",
    labels={'English': 'Anime Name', 'Popularity': 'Popularity Rank'},
    color='Popularity',
    hover_data={'Popularity': ':.0f'},
)
fig1.update_layout(xaxis_title="Anime Name", yaxis_title="Popularity Rank")
fig1.show()
# Score Distribution
# Interactive 20-bin histogram of 'Score' with a box plot in the margin
score_hover_format = {'Score': ':.2f'}
fig2 = px.histogram(
    anime_df_cleaned,
    x='Score',
    nbins=20,
    title="Score Distribution",
    labels={'Score': 'Anime Score'},
    marginal="box",
    hover_data=score_hover_format,
)
fig2.update_layout(xaxis_title="Score", yaxis_title="Count")
fig2.show()
# Extract year and group by it for total members
# 'Aired' is free text. pd.to_datetime(..., errors='coerce') silently turns
# every value it cannot parse as a single date into NaT, which discards those
# titles from the yearly totals (NOTE(review): date ranges are presumably
# affected - confirm against the raw column). Take the first 4-digit number
# instead, exactly as the matplotlib version of this chart does.
anime_df_cleaned['Year'] = (
    anime_df_cleaned['Aired'].str.extract(r'(\d{4})', expand=False).astype(float)
)
# Keep the 7 largest totals, then order them chronologically so the line runs
# left to right instead of zig-zagging in descending-total order.
yearly_members = anime_df_cleaned.groupby('Year')['Members'].sum().nlargest(7).sort_index()
yearly_members_df = yearly_members.reset_index()
fig3 = px.line(
    yearly_members_df,
    x='Year',
    y='Members',
    title="Top 7 Years by Total Members",
    labels={'Year': 'Year', 'Members': 'Total Members'},
    markers=True,
)
fig3.update_layout(xaxis_title="Year", yaxis_title="Total Members")
fig3.show()
# Type Distribution
# Interactive pie chart of the number of titles per type
type_counts = anime_df_cleaned['Type'].value_counts(sort=True, ascending=False)
fig4 = px.pie(
    type_counts,
    values=type_counts,
    names=type_counts.index,
    title="Distribution of Anime Types",
)
fig4.show()
# Top Genre Combinations
# The 10 most frequent 'Genres' strings, shown as an interactive bar chart
genre_frequency = anime_df_cleaned['Genres'].value_counts()
genre_combinations = genre_frequency.nlargest(10)
fig5 = px.bar(
    genre_combinations,
    x=genre_combinations.index,
    y=genre_combinations,
    title="Top Genre Combinations",
    labels={'y': 'Count', 'index': 'Genres'},
)
fig5.update_layout(xaxis_title="Genre Combinations", yaxis_title="Count")
fig5.show()
# Score by Anime Source
# One box per source material, coloured by source, showing the spread of 'Score'
source_axis_labels = {'Source': 'Source of Anime', 'Score': 'Average Score'}
fig6 = px.box(
    anime_df_cleaned,
    x='Source',
    y='Score',
    title="Score Distribution by Source",
    labels=source_axis_labels,
    color='Source',
)
fig6.update_layout(xaxis_title="Source", yaxis_title="Score")
fig6.show()
# Trend of Anime Production Over Time
# Number of titles per 'Year', ordered by year, drawn as an area chart
titles_per_year = anime_df_cleaned['Year'].value_counts()
anime_per_year = titles_per_year.sort_index()
fig7 = px.area(
    x=anime_per_year.index,
    y=anime_per_year,
    title="Trend of Anime Production Over Time",
    labels={'x': 'Year', 'y': 'Number of Anime Produced'},
)
fig7.update_layout(xaxis_title="Year", yaxis_title="Count")
fig7.show()
# Popularity vs. Score by Anime Type
# Scatter of 'Score' against 'Popularity', coloured by 'Type'; the English
# title is added to the hover tooltip
scatter_labels = {'Score': 'Score', 'Popularity': 'Popularity Rank', 'Type': 'Anime Type'}
fig9 = px.scatter(
    anime_df_cleaned,
    x='Score',
    y='Popularity',
    color='Type',
    title="Popularity vs. Score by Anime Type",
    labels=scatter_labels,
    hover_data=['English'],
)
fig9.update_layout(xaxis_title="Score", yaxis_title="Popularity Rank")
fig9.show()
# Calculate the top 10 studios by average score
# 'Studios' can list several co-producing studios separated by ", ". Grouping
# on the raw string would treat every distinct combination as its own studio,
# so give each studio its own row first (and strip the space left by the
# split), consistent with how the studio counts are computed earlier.
studio_scores = (
    anime_df_cleaned[['Studios', 'Score']]
    .assign(Studios=anime_df_cleaned['Studios'].str.split(','))
    .explode('Studios')
    .reset_index(drop=True)
)
studio_scores['Studios'] = studio_scores['Studios'].str.strip()
# NOTE(review): no minimum title count is applied, so a studio with a single
# highly rated title can still rank first - consider adding a threshold.
studio_avg_score = studio_scores.groupby('Studios')['Score'].mean().nlargest(10)
fig11 = px.bar(
    studio_avg_score,
    x=studio_avg_score.index,
    y='Score',
    title="Top 10 Studios by Average Anime Score",
    labels={'x': 'Studio', 'Score': 'Average Score'},
)
fig11.update_layout(xaxis_title="Studio", yaxis_title="Average Score")
fig11.show()
# Average episode duration by genre
def _duration_to_minutes(duration):
    """Convert 'Duration' text to minutes.

    Handles values such as '24 min. per ep.' and '1 hr. 50 min.'. A 'sec' part
    is also accepted in case very short titles use one. Values without any
    recognizable part (e.g. the 'Unknown' placeholder) become NaN.
    """
    text = duration.astype(str)
    hours = pd.to_numeric(text.str.extract(r'(\d+)\s*hr', expand=False), errors='coerce')
    minutes = pd.to_numeric(text.str.extract(r'(\d+)\s*min', expand=False), errors='coerce')
    seconds = pd.to_numeric(text.str.extract(r'(\d+)\s*sec', expand=False), errors='coerce')
    parts = pd.concat([hours * 60, minutes, seconds / 60], axis=1)
    # min_count=1 keeps rows where nothing was parsed as NaN instead of 0
    return parts.sum(axis=1, min_count=1)


# No 'Average_Episode_Duration' column is ever created elsewhere, so derive it
# here from 'Duration' on a local copy (anime_df_cleaned itself is not modified).
genre_duration_df = anime_df_cleaned.assign(
    Average_Episode_Duration=_duration_to_minutes(anime_df_cleaned['Duration'])
)
genre_duration_df = genre_duration_df.dropna(subset=['Genres', 'Average_Episode_Duration'])
# 'Not Specified' is the placeholder filled in for missing genres, not a genre
genre_duration_df = genre_duration_df[genre_duration_df['Genres'] != 'Not Specified']
# One row per individual genre
genre_duration_df = (
    genre_duration_df
    .assign(Genre=genre_duration_df['Genres'].str.split(','))
    .explode('Genre')
    .reset_index(drop=True)
)
# Strip the space left by the split and collapse names the scrape doubled
# (e.g. 'ActionAction' -> 'Action') so each genre forms a single group
genre_duration_df['Genre'] = (
    genre_duration_df['Genre'].str.strip().str.replace(r'^(.+?)\1$', r'\1', regex=True)
)
# Group by 'Genre' and keep the 10 largest mean durations
avg_duration_by_genre = (
    genre_duration_df.groupby('Genre')['Average_Episode_Duration'].mean().nlargest(10)
)
# Plotting with Plotly
fig10 = px.bar(
    avg_duration_by_genre,
    x=avg_duration_by_genre.index,
    y='Average_Episode_Duration',
    title="Average Episode Duration by Genre",
    labels={'x': 'Genre', 'Average_Episode_Duration': 'Average Duration (min)'},
)
fig10.update_layout(xaxis_title="Genre", yaxis_title="Average Duration (minutes)")
fig10.show()
import plotly.express as px
# Treemap of 'Genres' -> 'Type', tile size from 'Popularity', colour from 'Score'
# NOTE(review): 'Popularity' looks like a rank (1 = most popular), in which case
# larger tiles mean LESS popular titles - confirm this sizing is intended.
treemap_hierarchy = ['Genres', 'Type']
treemap_hover = {'Popularity': True, 'Score': True}
fig = px.treemap(
    anime_df_cleaned,
    path=treemap_hierarchy,
    values='Popularity',
    color='Score',
    hover_data=treemap_hover,
    title="Treemap of Popularity by Genre and Type",
)
# Show the label, the value and the share of the parent tile on every tile
fig.update_traces(textinfo="label+value+percent parent")
fig.show()
C:\Users\assem\anaconda3\lib\site-packages\plotly\express\_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\assem\anaconda3\lib\site-packages\plotly\express\_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
import plotly.graph_objects as go
import pandas as pd
# Mean 'Score' for every (Type, Rating) pair, pivoted so that types are rows
# and ratings are columns
mean_score_by_pair = anime_df_cleaned.groupby(['Type', 'Rating'])['Score'].mean()
heatmap_data = mean_score_by_pair.unstack(level=-1)
# Heatmap of those averages
score_heatmap = go.Heatmap(
    z=heatmap_data.values,
    x=heatmap_data.columns,
    y=heatmap_data.index,
    colorscale='Cividis',
    colorbar={'title': "Average Score"},
)
fig = go.Figure(data=score_heatmap)
fig.update_layout(
    title="Average Score by Type and Rating",
    xaxis_title="Rating",
    yaxis_title="Type",
)
fig.show()
# Sunburst of 'Studios' -> 'Genres' -> 'Type', segment size from 'Score',
# colour from 'Popularity'
sunburst_hierarchy = ['Studios', 'Genres', 'Type']
sunburst_hover = {'Score': True, 'Popularity': True}
fig = px.sunburst(
    anime_df_cleaned,
    path=sunburst_hierarchy,
    values='Score',
    color='Popularity',
    hover_data=sunburst_hover,
    title="Sunburst of Studios, Genres, and Type by Score",
)
# Show the label and the percentage of the current entry on every segment
fig.update_traces(textinfo="label+percent entry")
fig.show()
C:\Users\assem\anaconda3\lib\site-packages\plotly\express\_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\assem\anaconda3\lib\site-packages\plotly\express\_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\assem\anaconda3\lib\site-packages\plotly\express\_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.